Please click the links below to jump to the relevant area
This project aims to apply various supervised and unsupervised machine learning algorithms.
The following steps are included within the project.
1. Dataset Selection
2. Attribute Analysis
3. Data Visualization
4. Unsupervised Machine Learning
- K-Means Clustering
- Hierarchical Clustering
5. Feature Selection
6. Dimensionality Reduction
7. Test, Train, Split
8. Data Scaling
9. Supervised Machine Learning
CLASSIFICATION
- Logistic Regression
- K-Nearest Neighbors (KNN)
- Support Vector Machine
- Decision Tree
- Bagging (Bootstrap Aggregating)
- Random Forest
- Naive Bayes
i. Gaussian Naive Bayes
ii. Multinomial Naive Bayes
iii. Bernoulli Naive Bayes
REGRESSION
- Linear Regression
# NOTE(review): a Mapbox access token was committed on this line; it has been redacted. Revoke the exposed token and load secrets from the environment instead of the source file.
import pandas as pd, numpy as np, matplotlib.pyplot as plt, time, plotly.plotly as py, plotly.graph_objs as go, seaborn as sns
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from joblib import Parallel, delayed
from ipywidgets import FloatProgress
import matplotlib.pyplot as plt
import multiprocessing
from IPython.core.display import display, HTML
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
from plotly import tools
import cufflinks as cf
from collections import Counter
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn import decomposition
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn import metrics
# Notebook setup: inline matplotlib, cell-timing extension, offline plotly.
%matplotlib inline
%load_ext autotime
init_notebook_mode(connected=True)
def __progressbar(ticks):
    """Create and display a FloatProgress widget spanning *ticks* steps.

    Callers advance it by adding 1 to the returned widget's ``.value``
    once per completed unit of work.
    """
    progress_widget = FloatProgress(min=0, max=ticks)
    display(progress_widget)
    return progress_widget
import warnings; warnings.simplefilter('ignore')
import seaborn as sns
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
def run_classifiers(X, y, num_splits, rnd_state, __bar):
    """Cross-validate a fixed suite of ten classifiers on (X, y).

    Parameters
    ----------
    X, y : feature matrix and label vector accepted by scikit-learn.
    num_splits : number of K-Fold splits.
    rnd_state : seed used to shuffle the folds.  (Previously this argument
        was ignored in favor of a hard-coded ``seed = 1``.)
    __bar : FloatProgress widget; advanced by 1 after each model finishes.

    Returns
    -------
    list of per-model arrays holding the fold accuracy scores, in the same
    order as the models are defined below.
    """
    # prepare models as (short name, estimator) pairs in a fixed order
    models = [
        ('LR', LogisticRegression()),
        ('KNN', KNeighborsClassifier()),
        ('LSVM', LinearSVC()),
        ('SVM', SVC()),
        ('DTC', DecisionTreeClassifier()),
        ('BAG', BaggingClassifier()),
        ('RF', RandomForestClassifier()),
        ('GNB', GaussianNB()),
        ('MNB', MultinomialNB()),
        ('BNB', BernoulliNB()),
    ]
    # evaluate each model in turn
    results = []
    names = []
    scoring = 'accuracy'
    # shuffle=True is required for random_state to take effect (passing a
    # random_state to an unshuffled KFold raises in modern scikit-learn).
    # One shared fold object keeps fold assignments identical across models.
    kfold = model_selection.KFold(n_splits=num_splits, shuffle=True,
                                  random_state=rnd_state)
    for name, model in models:
        cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
        __bar.value += 1
    return results
from pylab import rcParams
def draw_confusion_matrix(y_test, y_pred):
    """Render a 2x2 fraud/no-fraud confusion-matrix heatmap and report scores.

    Shows accuracy, precision and recall inline as percentages and returns
    them as an ``(accuracy, precision, recall)`` tuple of fractions.
    """
    rcParams['figure.figsize'] = 5, 5
    class_names = ['No Fraud (0)','Fraud (1)']
    cm = confusion_matrix(y_test, y_pred)
    # Transposed so predictions run along the y axis and truth along x.
    sns.heatmap(cm.T, annot=True, fmt='d', square=True, cbar=False,
                cmap="BuPu", linewidths=1, linecolor='black',
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    display(HTML('<b>Accuracy</b> = {:.2f}'.format(accuracy * 100)))
    display(HTML('<b>Precision</b> = {:.2f}'.format(precision * 100)))
    display(HTML('<b>Recall</b> = {:.2f}'.format(recall * 100)))
    plt.show()
    return accuracy, precision, recall
class RandomForestClassifierWithCoef(RandomForestClassifier):
    """RandomForestClassifier that exposes ``feature_importances_`` as
    ``coef_`` so it can be used with selectors (e.g. RFECV) that expect a
    ``coef_`` attribute."""

    def fit(self, *args, **kwargs):
        """Fit the forest, then mirror ``feature_importances_`` into ``coef_``.

        Returns ``self``, per the scikit-learn estimator convention (the
        original implementation implicitly returned None).
        """
        super().fit(*args, **kwargs)
        self.coef_ = self.feature_importances_
        return self
The dataset I have selected is from Kaggle. It is Paysim synthetic dataset of mobile money transactions. Each step represents an hour of simulation. It can be downloaded from the following URL:
https://www.kaggle.com/ntnu-testimon/paysim1/downloads/paysim1.zip/2
It has the following attributes:
1. step: Maps a unit of time in the real world. In this case 1 step is 1 hour of time.
2. type: CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER.
3. amount: Amount of the transaction in local currency.
4. nameOrig: Customer who started the transaction.
5. oldbalanceOrg: Initial balance before the transaction.
6. newbalanceOrig: Customer's balance after the transaction.
7. nameDest: Recipient ID of the transaction.
8. oldbalanceDest: Initial recipient balance before the transaction.
9. newbalanceDest: Recipient's balance after the transaction.
10. isFraud: Identifies a fraudulent transaction (1) and non fraudulent (0).
11. isFlaggedFraud: Flags illegal attempts to transfer more than 200,000 in a single transaction.
# Load the full PaySim transaction log from CSV and preview the first rows.
mobile_txns_file = "ps_transactions_log.csv"
df_txns_full = pd.read_csv(mobile_txns_file)
df_txns_full.head(5)
As we can see below, type, nameOrig and nameDest are objects, meaning they will have to be converted to labels (levels) to be used in some algorithms.
step is int, however, it also needs to be encoded because it doesn't represent a numeric value, but rather an ordinal (categorical) value.
Are there any null values? If they are, they need to be cleaned up. Turns out that the data has already been cleaned up, there are no null values.
# Check for nulls per column, then show summary statistics and dtype/row info.
df_txns_full.isnull().any()
display(df_txns_full.describe())
display(df_txns_full.info())
It also shows that this dataset has over 6 million rows, we will reduce the number of rows to a smaller subset of data, so that the processing can be optimized.
# Split fraud vs. legitimate rows, then build a ~100k-row working subset that
# keeps every fraud transaction and tops up with legitimate ones.
df_fraud = df_txns_full[df_txns_full['isFraud'] == 1]
df_legit = df_txns_full[df_txns_full['isFraud'] == 0]
print("Total fraud txns: {}".format(len(df_fraud)))
print("Total legit txns: {}".format(len(df_legit)))
df_legit_sub = df_legit.head(100000 - len(df_fraud))
df_txns = pd.concat([df_legit_sub, df_fraud], axis = 0).reset_index(drop=True)
print("Selected subset: {}".format(len(df_txns)))
# shuffle rows so that fraud and legit rows are mixed
df_txns = df_txns.sample(frac=1).reset_index(drop=True)
# Save data for easy load
df_txns.to_pickle('df_txns.pkl')
isFlaggedFraud appears to be a redundant attribute. By definition, any transaction that is over 200,000 is marked as isFlaggedFraud. Let's check whether there are any transactions that are marked as isFlaggedFraud but are not marked with the isFraud flag.
# Count transactions flagged by the 200,000 rule that are NOT labeled fraud;
# a result of 0 means isFlaggedFraud carries no information beyond isFraud.
df_txns = pd.read_pickle('df_txns.pkl')
print(len(df_txns[(df_txns['isFlaggedFraud'] == 1) & (df_txns['isFraud'] != 1)]))
Drop the column since it is in fact redundant with isFraud
# isFlaggedFraud is redundant with isFraud, so drop it.
df_txns_clean1 = df_txns.drop(['isFlaggedFraud'], axis = 1)
The nameOrig column is a unique categorical identifier, therefore it will not add any information to our model and can be dropped. Similarly, nameDest is a categorical value that repeats only twice on average, doesn't provide much variance, and can be dropped.
# nameOrig is unique per row and nameDest nearly so — neither adds signal.
print(len(df_txns_clean1))
print(len(df_txns_clean1['nameOrig'].unique()))
print(len(df_txns_clean1['nameDest'].unique()))
df_txns_cln = df_txns_clean1.drop(['nameOrig', 'nameDest'], axis = 1)
df_txns_cln.to_pickle('df_txns_cln.pkl')
type is a categorical variable with 5 different categories, lets visualize them.
# Stacked bar chart: transaction counts per type, split fraud vs. legitimate.
df_txns = pd.read_pickle('df_txns_cln.pkl')
countsFraud = df_txns[df_txns['isFraud']==1]['type'].value_counts()
countsLegit = df_txns[df_txns['isFraud']==0]['type'].value_counts()
data = [go.Bar(
            x=countsFraud.index,
            y=countsFraud.values,
            name = 'Fraud'
        ),
        go.Bar(
            x=countsLegit.index,
            y=countsLegit.values,
            name = 'Legitimate'
        )]
layout = go.Layout(barmode='stack')
fig = go.Figure(data=data, layout=layout)
iplot(fig)
The bar graph above shows that the only two type of transactions that suffer fraud are CASH-OUT and TRANSFER.
# Scatterplot matrix (SPLOM) of the six numeric columns, colored by fraud label.
textd = ['isLegit' if cl==0 else 'isFraud' for cl in df_txns['isFraud']]
# 0/1 color values mapped through a two-step color scale: blue = legit, red = fraud.
color_vals = [0 if cl==0 else 1 for cl in df_txns['isFraud']]
pl_colorscaled = [[0., '#119dff'],
                  [0.5, '#119dff'],
                  [0.5, '#ef553b'],
                  [1, '#ef553b']]
traced = go.Splom(dimensions=[dict(label='Step', values=df_txns['step']),
                              dict(label='Amount', values=df_txns['amount']),
                              dict(label='Old Balance Origin', values=df_txns['oldbalanceOrg']),
                              dict(label='New Balance Origin', values=df_txns['newbalanceOrig']),
                              dict(label='Old Balance Dest.', values=df_txns['oldbalanceDest']),
                              dict(label='New Balance Dest.', values=df_txns['newbalanceDest'])],
                  marker=dict(color=color_vals,
                              size=5,
                              colorscale=pl_colorscaled,
                              line=dict(width=0.5,
                                        color='rgb(230,230,230)') ),
                  text=textd,
                  diagonal=dict(visible=False))
# Shared axis styling applied to all 6x6 panel axes below.
axisd = dict(showline=False,
             zeroline=False,
             gridcolor='#fff',
             ticklen=4,
             titlefont=dict(size=13))
title = "Scatterplot Matrix (SPLOM) for Mobile Fraud Dataset"
layout = go.Layout(title=title,
                   dragmode='select',
                   width=1000,
                   height=1000,
                   autosize=False,
                   hovermode='closest',
                   plot_bgcolor='rgba(240,240,240, 0.95)',
                   xaxis1=dict(axisd),
                   xaxis2=dict(axisd),
                   xaxis3=dict(axisd),
                   xaxis4=dict(axisd),
                   xaxis5=dict(axisd),
                   xaxis6=dict(axisd),
                   yaxis1=dict(axisd),
                   yaxis2=dict(axisd),
                   yaxis3=dict(axisd),
                   yaxis4=dict(axisd),
                   yaxis5=dict(axisd),
                   yaxis6=dict(axisd))
fig = dict(data=[traced], layout=layout)
iplot(fig, filename='large')
step is an ordinal (categorical) variable. It means it can be encoded using a label encoder.
# step is ordinal, so mapping it to integer codes with LabelEncoder suffices.
lbl_encoder = LabelEncoder()
df_txns['step'] = lbl_encoder.fit_transform(df_txns['step'])
type is a categorical variable as well; however, it is nominal (not ordinal), therefore simply applying a Label Encoder won't work. We will have to create dummies out of it or apply a One Hot Encoder.
# One-hot encode the nominal 'type' column via dummies, then drop the original.
df_dummies = pd.get_dummies(df_txns['type'])
df_txns_d = df_txns.merge(df_dummies, left_index=True, right_index=True)
df_txns_d = df_txns_d.drop(['type'], axis = 1)
df_txns_d.to_pickle('df_txns_d.pkl')
df_txns_d.head(5)
Initially, we will start with K = 2 clusters, do the analysis and repeat with appropriate clusters
from sklearn.cluster import KMeans
# Fit K-Means with K = 2 on the feature columns (label column removed).
df_txns_d = pd.read_pickle('df_txns_d.pkl').drop(['isFraud'], axis = 1)
k_means = KMeans(n_clusters = 2)
k_means.fit(df_txns_d)
Find labels of the K-Means predictions
# Cluster assignment (0 or 1) for every row of the fitted data.
labels = k_means.labels_
print(labels)
We can use an inertia plot to find the best value for the K parameter. Lower inertia corresponds to tighter, more compact clusters.
# Elbow method: fit K-Means for K = 1..10 and record the inertia at each K.
n = list(range(1,11))
inertia = []
for k in n:
    k_means = KMeans(n_clusters = k)
    k_means.fit(df_txns_d)
    inertia.append(k_means.inertia_)
print(inertia)
iplot([{
    'x': n,
    'y': inertia,
    'name': "K-Means K vs Inertia"
}])
This graph above shows that at K = 5, the drop in inertia slows down. Therefore, it is the ideal value for K.
# Refit with the elbow-chosen K = 5 and attach cluster labels to the frame.
k_means = KMeans(n_clusters = 5)
k_means.fit(df_txns_d)
df_txns_d['labels'] = k_means.labels_
print(k_means.cluster_centers_)
# graph in 3d for various combinations of columns that may yield some insight
# Each tuple picks three column indices; one interactive 3-D scatter is drawn
# per tuple, with one trace (color) per cluster.
cols = [(0,1,2),(0,1,3),(0,1,4),(1,2,3),(1,4,5)]
colors = ['red','green','blue','purple','teal']
for lim in cols:
    data = []
    col_sets = df_txns_d.columns[[lim[0],lim[1],lim[2]]].values
    print(col_sets)
    for cluster in range(len(df_txns_d['labels'].unique())):
        # current cluster data subset with 3 columns only
        c_data = df_txns_d[df_txns_d['labels'] == cluster][col_sets]
        scatterPlot = dict(
            type = 'scatter3d',
            mode = "markers",
            name = "Cluster " + str(cluster + 1),
            x = c_data.values[:,0], y = c_data.values[:,1], z = c_data.values[:,2],
            marker = dict( size=2, color=colors[cluster])
        )
        data.append(scatterPlot)
    layout = dict(
        title = 'Interactive K-Means ' + ', '.join(col_sets),
        scene = dict(
            xaxis = dict( zeroline=True, title=col_sets[0] ),
            yaxis = dict( zeroline=True, title=col_sets[1] ),
            zaxis = dict( zeroline=True, title=col_sets[2] ),
        )
    )
    iplot(dict(data = data, layout=layout))
Draw a heatmap with total clustered points for each attributes to observe which cluster contains most information regarding which attribute
# Heatmap of per-cluster column sums: shows which attribute dominates each cluster.
clusters = ['Cluster ' + str(x) for x in list(range(0,5))]
sums = df_txns_d.groupby(['labels'], sort=True).sum()
data = [go.Heatmap( z=sums.values.tolist(),
                    y= clusters,
                    x= df_txns_d.columns.difference(['labels']).values,
                    colorscale='Viridis')]
iplot(data)
The above graphs and heatmap reveal that K-Means has clustered most of the data by amount and the 5 different types of transactions. This reveals that the amounts strongly correlate with the types of transactions.
Hierarchical Clustering is performed by type of each transaction for a random sample of 50 transactions
# Hierarchical (complete-linkage) clustering on a 50-row sample; dendrogram
# leaves are labeled with each row's transaction type.
df_txns_h = pd.read_pickle('df_txns_cln.pkl')
df_txns_h.head(5)
df_txns_h_sub = df_txns_h.sample(50, random_state=2)
txn_types = list(df_txns_h_sub.pop('type'))
samples = df_txns_h_sub.values
mergings = linkage(samples, method='complete')
plt.figure(figsize=(10, 5))
dendrogram(mergings,
           labels=txn_types
           )
plt.show()
Hierarchical clustering shows a strong relationship between CASH-IN and CASH-OUT transactions. TRANSFER transactions sit between them. PAYMENT and DEBIT have a distant relationship with the other 3 types.
Data must be scaled to the same level before applying cross-validation for classifiers.
# Scale all features to [0, 1]. MinMaxScaler is used (rather than standardizing)
# because some classifiers below (e.g. MultinomialNB) reject negative values.
df_txns_d = pd.read_pickle('df_txns_d.pkl')
scaler = MinMaxScaler()
y = df_txns_d.pop('isFraud').values
X = df_txns_d
X_scaled = scaler.fit_transform(X)
X_scaled
Calculate cross-validation accuracy score for each classifier with Folds(K) = 10
# One progress tick per classifier (10 models); persist CV results for reuse.
progress_bar = __progressbar(10)
result = run_classifiers(X = X_scaled, y = y, num_splits = 10, rnd_state = 1, __bar = progress_bar)
pd.DataFrame({'results': [result]}, columns=['results']).to_pickle('df_cv.pkl')
# Plot the mean cross-validation accuracy for each classifier.
cross_val_results = pd.read_pickle('df_cv.pkl')['results'][0]
models = ['Logistic Regression','K-Nearest Neighbors','Linear Support Vector Machine',\
          'Multilinear Support Vector Machine','Decision Tree','Bagging','Random Forest',\
          'Guassian Naiive Bayes','Multinomial Naiive Bayes','Bernoulli Naiive Bayes']
mean_cross_val = []
for x in cross_val_results:
    mean_cross_val.append(np.mean(x))
mean_cross_val
iplot([{
    'x': models,
    'y': mean_cross_val,
    'name': "Cross Validation Mean"
}], filename='cufflinks/classifiers-cmp')
It is clear from the graph above that Random Forest has the highest accuracy based on cross-validation score.
We will perform RFECV using Random Forest Model as it has scored the highest in cross-validation score.
Accuracy Curve
# RFECV with the coef_-exposing random forest, scored on accuracy over a
# stratified 10-fold split; plot score vs. number of features kept.
nb=RandomForestClassifierWithCoef()
rfecv = RFECV(estimator=nb, step=1, cv=StratifiedKFold(10),
              scoring='accuracy')
rfecv.fit(X, y)
# NOTE(review): grid_scores_ was deprecated/removed in newer scikit-learn
# (cv_results_ replaces it) — confirm against the installed version.
print(type(rfecv.grid_scores_))
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
# Rank 1 marks features RFECV keeps; higher ranks were eliminated earlier.
df_ranks = pd.DataFrame({'cols': X.columns,'rank': rfecv.ranking_}).\
    sort_values(['rank']).reset_index(drop=True)
df_ranks
From ranking = 1 for all attributes of the data, it is clear that all attributes must be used for analysis and none of them can be dropped from accuracy point of view.
Precision Curve
# Same RFECV procedure as above but scored on precision.
nb=RandomForestClassifierWithCoef()
rfecv = RFECV(estimator=nb, step=1, cv=StratifiedKFold(10),
              scoring='precision')
rfecv.fit(X, y)
print(type(rfecv.grid_scores_))
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Precision Score")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
# Feature ranking for the precision-scored elimination.
df_ranks = pd.DataFrame({'cols': X.columns,'rank': rfecv.ranking_}).\
    sort_values(['rank']).reset_index(drop=True)
df_ranks
This graph shows that the highest precision can be achieved by selecting the first 8 attributes
Recall Curve
# Same RFECV procedure, scored on recall.
nb=RandomForestClassifierWithCoef()
rfecv = RFECV(estimator=nb, step=1, cv=StratifiedKFold(10),
              scoring='recall')
rfecv.fit(X, y)
print(type(rfecv.grid_scores_))
plt.figure()
plt.xlabel("Number of features selected")
# Fixed copy-paste bug: this curve is scored with 'recall', so the axis
# label must say Recall, not Precision.
plt.ylabel("Recall Score")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
# Feature ranking for the recall-scored elimination.
df_ranks = pd.DataFrame({'cols': X.columns,'rank': rfecv.ranking_}).\
    sort_values(['rank']).reset_index(drop=True)
df_ranks
It shows that highest Recall is possible with just two attributes, step and amount.
We will pick 6 components for component analysis and compare performance against original components.
# Project the feature matrix onto its first 6 principal components.
df_txns_d = pd.read_pickle('df_txns_d.pkl')
y = df_txns_d.pop('isFraud').values
X = df_txns_d
pca = decomposition.PCA(n_components=6)
pca.fit(X)
X_pca = pca.transform(X)
X_pca
# 70/30 train/test split for both the raw and the PCA-projected features.
# The same random_state means the row split is identical for both, so
# y_train/y_test apply to the PCA variants as well.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size = 0.3)
X_pca_train, X_pca_test, y_pca_train, y_pca_test = train_test_split(X_pca, y, random_state=1, test_size = 0.3)
from sklearn.preprocessing import StandardScaler
# Standard-scale each feature set; scalers are fit on train only and then
# applied to test to avoid leakage.
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)
sc = StandardScaler()
X_pca_train_scaled = sc.fit_transform(X_pca_train)
X_pca_test_scaled = sc.transform(X_pca_test)
# Some classifiers such as Multinomial Naive Bayes don't accept negative values,
# therefore MinMaxScaler with its default range of 0 to 1 is used.
scm = MinMaxScaler()
X_train_mm = scm.fit_transform(X_train)
X_test_mm = scm.transform(X_test)
scm = MinMaxScaler()
X_pca_train_mm = scm.fit_transform(X_pca_train)
X_pca_test_mm = scm.transform(X_pca_test)
# Table indexed by model name, intended to collect per-model statistics.
models = ['Logistic Regression','K-Nearest Neighbors','Linear Support Vector Machine',\
          'Multilinear Support Vector Machine','Decision Tree','Bagging','Random Forest',\
          'Guassian Naiive Bayes','Multinomial Naiive Bayes','Bernoulli Naiive Bayes']
df_stats = pd.DataFrame(models, columns=['model'])
df_stats.set_index('model')
# Logistic Regression: fit/evaluate on scaled features, then on PCA features.
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
accuracy, precision, recall = draw_confusion_matrix(y_test, y_pred)
display(HTML('<b>With PCA</b>'))
model.fit(X_pca_train_scaled, y_train)
y_pca_pred = model.predict(X_pca_test_scaled)
accuracy, precision, recall = draw_confusion_matrix(y_test, y_pca_pred)
# K-Nearest Neighbors (k=3): scaled features, then PCA features.
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
draw_confusion_matrix(y_test, y_pred)
display(HTML('<b>With PCA</b>'))
model.fit(X_pca_train_scaled, y_train)
y_pca_pred = model.predict(X_pca_test_scaled)
draw_confusion_matrix(y_test, y_pca_pred)
# Support Vector Machine (RBF kernel): scaled features, then PCA features.
model = SVC()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
draw_confusion_matrix(y_test, y_pred)
display(HTML('<b>With PCA</b>'))
model.fit(X_pca_train_scaled, y_train)
y_pca_pred = model.predict(X_pca_test_scaled)
draw_confusion_matrix(y_test, y_pca_pred)
# Decision Tree: scaled features, then PCA features.
model = DecisionTreeClassifier()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
draw_confusion_matrix(y_test, y_pred)
display(HTML('<b>With PCA</b>'))
model.fit(X_pca_train_scaled, y_train)
y_pca_pred = model.predict(X_pca_test_scaled)
draw_confusion_matrix(y_test, y_pca_pred)
# Bagging (bootstrap aggregating): scaled features, then PCA features.
model = BaggingClassifier()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
draw_confusion_matrix(y_test, y_pred)
display(HTML('<b>With PCA</b>'))
model.fit(X_pca_train_scaled, y_train)
y_pca_pred = model.predict(X_pca_test_scaled)
draw_confusion_matrix(y_test, y_pca_pred)
# Random Forest: scaled features, then PCA features.
model = RandomForestClassifier()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
draw_confusion_matrix(y_test, y_pred)
display(HTML('<b>With PCA</b>'))
model.fit(X_pca_train_scaled, y_train)
y_pca_pred = model.predict(X_pca_test_scaled)
draw_confusion_matrix(y_test, y_pca_pred)
# Gaussian Naive Bayes: scaled features, then PCA features.
model = GaussianNB()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
draw_confusion_matrix(y_test, y_pred)
display(HTML('<b>With PCA</b>'))
model.fit(X_pca_train_scaled, y_train)
y_pca_pred = model.predict(X_pca_test_scaled)
draw_confusion_matrix(y_test, y_pca_pred)
# Multinomial Naive Bayes: uses the MinMax-scaled variants because this
# model rejects negative feature values.
model = MultinomialNB()
model.fit(X_train_mm, y_train)
y_pred = model.predict(X_test_mm)
draw_confusion_matrix(y_test, y_pred)
display(HTML('<b>With PCA</b>'))
model.fit(X_pca_train_mm, y_train)
y_pca_pred = model.predict(X_pca_test_mm)
draw_confusion_matrix(y_test, y_pca_pred)
# Bernoulli Naive Bayes: scaled features, then PCA features.
model = BernoulliNB()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
draw_confusion_matrix(y_test, y_pred)
display(HTML('<b>With PCA</b>'))
model.fit(X_pca_train_scaled, y_train)
y_pca_pred = model.predict(X_pca_test_scaled)
draw_confusion_matrix(y_test, y_pca_pred)
It appears that in my case, Principal Component Analysis almost always performed worse than normally scaled data.
Best performing Classifier was Bagging (Bootstrap Aggregating) Classifier with accuracy of 99.93%
Worst performing Classifier was Gaussian Naive Bayes with accuracy of 64.19%
For Linear Regression, we need a continuous value as the label attribute. For this purpose, we will pick amount as the labeled attribute.
# Regression setup: predict transaction amount from the remaining features,
# with a 70/30 train/test split.
df_txns_d = pd.read_pickle('df_txns_d.pkl')
y = df_txns_d.pop('amount')
X = df_txns_d
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size = 0.3)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
# Fit ordinary least squares and report error metrics on the held-out split.
lr = LinearRegression()
lr.fit(X_train, y_train)
print(lr.intercept_)
# Per-feature coefficients paired with their column names.
list(zip(X_train.columns.values, lr.coef_))
y_pred = lr.predict(X_test)
print(y_pred.shape)
# Fixed: MAE was previously computed with mean_squared_error (copy-paste bug),
# so the displayed "Mean Absolute Error" was actually the MSE.
MAE = metrics.mean_absolute_error(y_test, y_pred)
MSE = metrics.mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
display(HTML('<b>Mean Absolute Error</b> (MAE) = {}'.format(MAE)))
display(HTML('<b>Mean Squared Error</b> (MSE) = {}'.format(MSE)))
display(HTML('<b>Root Mean Squared Error</b> (RMSE) = {}'.format(RMSE)))
The errors shown above are clearly large; therefore, linear regression does not reveal accurate information about the amount of a transaction from the rest of the transaction data.